#Air France Case Study Assignment 
# First, we load all the necessary libraries
library(readxl)
## Warning: package 'readxl' was built under R version 4.2.2
library(qdap)
## Warning: package 'qdap' was built under R version 4.2.3
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Warning: package 'qdapRegex' was built under R version 4.2.3
## Loading required package: qdapTools
## Warning: package 'qdapTools' was built under R version 4.2.3
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:base':
## 
##     Filter, proportions
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:qdap':
## 
##     ngrams
## 
## Attaching package: 'tm'
## The following objects are masked from 'package:qdap':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
library(plotrix)
library(dendextend)
## Warning: package 'dendextend' was built under R version 4.2.3
## 
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
## 
##     id
## The following object is masked from 'package:qdapRegex':
## 
##     explain
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(plotly)
## Warning: package 'plotly' was built under R version 4.2.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.2.2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
## The following object is masked from 'package:qdapRegex':
## 
##     %+%
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.5
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## Warning: package 'tibble' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()      masks qdapRegex::%+%()
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::explain()    masks qdapRegex::explain()
## ✖ plotly::filter()    masks dplyr::filter(), stats::filter()
## ✖ dplyr::id()         masks qdapTools::id()
## ✖ dplyr::lag()        masks stats::lag()
# Next, we read in the dataset from an Excel file
AirFrance <- read_excel("C:/Users/User/Desktop/Air France Dataset.xlsx")

# We check for missing values in the dataset
# and replace blank cells with NA values
missing_vals <- sapply(AirFrance, function(x) sum(is.na(x)))
AirFrance[AirFrance==""] <- NA

# We create a new column for ROA as a KPI 
# by dividing the "Amount" column by the "Total Cost" column
AirFrance <- within(AirFrance, ROA <- Amount / `Total Cost`)

# We identify the unique publisher names in the dataset
publisher_names <- unique(AirFrance$`Publisher Name`)
publisher_names
## [1] "Yahoo - US"        "MSN - Global"      "Google - Global"  
## [4] "Overture - Global" "Google - US"       "Overture - US"    
## [7] "MSN - US"
# We tally the total sales for each publisher
sales_by_publisher <- c()
for (i in 1:length(publisher_names)) {
  # We subset the dataset to include only rows with the current publisher name
  publisher_sales <- AirFrance$Amount[which(AirFrance[, 2] == publisher_names[i])]
  # We sum up the sales for the current publisher and add it to our sales vector
  sales_by_publisher <- c(sales_by_publisher, sum(publisher_sales))
  i <- i + 1
}


# We combine the publisher names and their corresponding sales figures into a single dataframe
publisher_sales_df <- cbind(publisher_names, as.numeric(sales_by_publisher))
publisher_sales_df
##      publisher_names                
## [1,] "Yahoo - US"        "882288.95"
## [2,] "MSN - Global"      "145524.25"
## [3,] "Google - Global"   "929549.8" 
## [4,] "Overture - Global" "430084.7" 
## [5,] "Google - US"       "1745481.8"
## [6,] "Overture - US"     "347433.25"
## [7,] "MSN - US"          "181549.8"
# We identify the publisher with the highest sales
max_sales <- max(sales_by_publisher)
top_publisher <- publisher_names[which(sales_by_publisher == max_sales)]
top_publisher
## [1] "Google - US"
#We can now explore Publisher Strategy i.e The Most Effective Channel 
#We can start by creating a new dataframe for analysis
New_AirFrance <- AirFrance
# We calculate basic statistics for each variable of interest
Statistics <- c("Mean", "Median", "SD", "Min", "Max")
Amount_stats <- round(c(mean(New_AirFrance$Amount), median(New_AirFrance$Amount), sd(New_AirFrance$Amount), min(New_AirFrance$Amount), max(New_AirFrance$Amount)), 2)

# Calculate summary statistics for the Total Cost column
Total_Cost_stats <- round(c(mean(New_AirFrance$`Total Cost`), median(New_AirFrance$`Total Cost`), sd(New_AirFrance$`Total Cost`), min(New_AirFrance$`Total Cost`), max(New_AirFrance$`Total Cost`)), 2)

Impressions_stats <- round(c(mean(New_AirFrance$Impressions), median(New_AirFrance$Impressions), sd(New_AirFrance$Impressions), min(New_AirFrance$Impressions), max(New_AirFrance$Impressions)), 2)
Clicks_stats <- round(c(mean(New_AirFrance$Clicks), median(New_AirFrance$Clicks), sd(New_AirFrance$Clicks), min(New_AirFrance$Clicks), max(New_AirFrance$Clicks)), 2)

# We combine the statistics for each variable into a summary dataframe
Summary <- as.data.frame(cbind(Statistics, Amount_stats, Total_Cost_stats, Impressions_stats, Clicks_stats))

# We remove any rows with Total Cost = 0, as this will cause an infinite value when calculating ROA
New_AirFrance <- New_AirFrance[New_AirFrance$`Total Cost` != 0,]

# We can now create new columns for revenue, ROA, booking probability, cost per booking, and average revenue per booking
New_AirFrance$Revenue <- New_AirFrance$Amount - New_AirFrance$`Total Cost`
New_AirFrance$ROA <- round(New_AirFrance$Revenue / New_AirFrance$`Total Cost`, 2)
New_AirFrance$Book_Prob <- round((New_AirFrance$`Trans. Conv. %` * New_AirFrance$`Engine Click Thru %`) / 100, 2)
New_AirFrance$Cost_Book <- round(New_AirFrance$`Total Cost` / New_AirFrance$`Total Volume of Bookings`, 2)
New_AirFrance[New_AirFrance == ""] <- 0
New_AirFrance[New_AirFrance == "Inf"] <- 0
New_AirFrance$Average_Revenue_Booking <- round(New_AirFrance$Amount / New_AirFrance$`Total Volume of Bookings`, 2)

# We can check the summary statistics of our new dataframe
summary(New_AirFrance)
##  Publisher ID       Publisher Name      Keyword ID          Keyword         
##  Length:4509        Length:4509        Length:4509        Length:4509       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   Match Type          Campaign         Keyword Group        Category        
##  Length:4509        Length:4509        Length:4509        Length:4509       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Bid Strategy       Keyword Type          Status          Search Engine Bid
##  Length:4509        Length:4509        Length:4509        Min.   : 0.000   
##  Class :character   Class :character   Class :character   1st Qu.: 3.275   
##  Mode  :character   Mode  :character   Mode  :character   Median : 6.250   
##                                                           Mean   : 5.434   
##                                                           3rd Qu.: 6.250   
##                                                           Max.   :27.500   
##                                                                            
##      Clicks        Click Charges      Avg. Cost per Click  Impressions     
##  Min.   :    1.0   Min.   :    0.05   Min.   : 0.0392     Min.   :      1  
##  1st Qu.:    1.0   1st Qu.:    2.31   1st Qu.: 0.8250     1st Qu.:     28  
##  Median :    4.0   Median :    6.76   Median : 1.6510     Median :    176  
##  Mean   :  113.7   Mean   :  167.51   Mean   : 1.8907     Mean   :   9286  
##  3rd Qu.:   19.0   3rd Qu.:   28.50   3rd Qu.: 2.6625     3rd Qu.:    844  
##  Max.   :34012.0   Max.   :46188.44   Max.   :10.0000     Max.   :8342415  
##                                                                            
##  Engine Click Thru %   Avg. Pos.      Trans. Conv. %     Total Cost/ Trans.
##  Min.   :  0.01287   Min.   : 0.000   Min.   :  0.0000   Min.   :   0.00   
##  1st Qu.:  1.53489   1st Qu.: 1.143   1st Qu.:  0.0000   1st Qu.:   0.00   
##  Median :  4.10866   Median : 1.594   Median :  0.0000   Median :   0.00   
##  Mean   : 11.14392   Mean   : 1.930   Mean   :  0.5694   Mean   :  27.61   
##  3rd Qu.: 10.91954   3rd Qu.: 2.308   3rd Qu.:  0.0000   3rd Qu.:   0.00   
##  Max.   :200.00000   Max.   :15.000   Max.   :900.0000   Max.   :9597.17   
##                                                                            
##      Amount         Total Cost       Total Volume of Bookings
##  Min.   :     0   Min.   :    0.05   Min.   :  0.0000        
##  1st Qu.:     0   1st Qu.:    2.31   1st Qu.:  0.0000        
##  Median :     0   Median :    6.76   Median :  0.0000        
##  Mean   :  1034   Mean   :  167.51   Mean   :  0.8734        
##  3rd Qu.:     0   3rd Qu.:   28.50   3rd Qu.:  0.0000        
##  Max.   :567463   Max.   :46188.44   Max.   :439.0000        
##                                                              
##       ROA              Revenue           Book_Prob          Cost_Book      
##  Min.   :  -1.000   Min.   : -8725.9   Min.   : 0.00000   Min.   :   0.00  
##  1st Qu.:  -1.000   1st Qu.:   -18.9   1st Qu.: 0.00000   1st Qu.:   0.00  
##  Median :  -1.000   Median :    -5.0   Median : 0.00000   Median :   0.00  
##  Mean   :   3.415   Mean   :   866.3   Mean   : 0.06812   Mean   :  27.61  
##  3rd Qu.:  -1.000   3rd Qu.:    -1.6   3rd Qu.: 0.00000   3rd Qu.:   0.00  
##  Max.   :3794.870   Max.   :549524.1   Max.   :81.82000   Max.   :9597.17  
##                                                                            
##  Average_Revenue_Booking
##  Min.   :  34.0         
##  1st Qu.: 565.7         
##  Median : 900.1         
##  Mean   :1025.9         
##  3rd Qu.:1281.0         
##  Max.   :5877.8         
##  NA's   :4142
# Check data type of 'Match Type' column
typeof(New_AirFrance$`Match Type`)
## [1] "character"
# Check the first few rows of the new dataframe
head(New_AirFrance)
## # A tibble: 6 × 28
##   `Publisher ID` Publi…¹ Keywo…² Keyword Match…³ Campa…⁴ Keywo…⁵ Categ…⁶ Bid S…⁷
##   <chr>          <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1 K2615          Yahoo … 430000… fly to… Advanc… Wester… Floren… uncate… <NA>   
## 2 K2615          Yahoo … 430000… low in… Advanc… Geo Ta… Low In… uncate… <NA>   
## 3 K2003          MSN - … 430000… air di… Broad   Air Fr… France  uncate… Positi…
## 4 K1175          Google… 430000… [airfr… Exact   Air Fr… Air Fr… airfra… Positi…
## 5 K1123          Overtu… 430000… air fr… Standa… Unassi… Unassi… airfra… Positi…
## 6 K1123          Overtu… 430000… airfra… Standa… Unassi… Unassi… airfra… Positi…
## # … with 19 more variables: `Keyword Type` <chr>, Status <chr>,
## #   `Search Engine Bid` <dbl>, Clicks <dbl>, `Click Charges` <dbl>,
## #   `Avg. Cost per Click` <dbl>, Impressions <dbl>,
## #   `Engine Click Thru %` <dbl>, `Avg. Pos.` <dbl>, `Trans. Conv. %` <dbl>,
## #   `Total Cost/ Trans.` <dbl>, Amount <dbl>, `Total Cost` <dbl>,
## #   `Total Volume of Bookings` <dbl>, ROA <dbl>, Revenue <dbl>,
## #   Book_Prob <dbl>, Cost_Book <dbl>, Average_Revenue_Booking <dbl>, and …
# We create Pivot Table to compare ROA and Avg. Cost per Click 
AFdf_pivot <- New_AirFrance %>% group_by(`Publisher Name`) %>% summarize(
  avg_ROA = mean(ROA),
  avg_cpc = mean(`Avg. Cost per Click`)
)
summary(AFdf_pivot)
##  Publisher Name        avg_ROA          avg_cpc      
##  Length:7           Min.   : 1.181   Min.   :0.7639  
##  Class :character   1st Qu.: 1.219   1st Qu.:1.4018  
##  Mode  :character   Median : 4.485   Median :2.1530  
##                     Mean   : 4.827   Mean   :1.8854  
##                     3rd Qu.: 7.518   3rd Qu.:2.3050  
##                     Max.   :10.649   Max.   :2.8675
#We Create the second Pivot Table For Bubble Chart to compare different SEM
AFdf_pivot2 <- New_AirFrance %>% group_by(`Publisher Name`) %>% summarize(
  total_records = n(),
  total_amount = sum(`Total Cost`),
  avg_cpc = mean(`Avg. Cost per Click`),
  avg_prob = mean(`Book_Prob`),
  avg_ROA = mean(ROA)
)
summary(AFdf_pivot2)
##  Publisher Name     total_records     total_amount       avg_cpc      
##  Length:7           Min.   :  98.0   Min.   : 12160   Min.   :0.7639  
##  Class :character   1st Qu.: 246.0   1st Qu.: 31148   1st Qu.:1.4018  
##  Mode  :character   Median : 553.0   Median : 64296   Median :2.1530  
##                     Mean   : 644.1   Mean   :107902   Mean   :1.8854  
##                     3rd Qu.: 648.0   3rd Qu.:131461   3rd Qu.:2.3050  
##                     Max.   :2070.0   Max.   :353641   Max.   :2.8675  
##     avg_prob          avg_ROA      
##  Min.   :0.00646   Min.   : 1.181  
##  1st Qu.:0.02718   1st Qu.: 1.219  
##  Median :0.05449   Median : 4.485  
##  Mean   :0.06896   Mean   : 4.827  
##  3rd Qu.:0.10063   3rd Qu.: 7.518  
##  Max.   :0.16614   Max.   :10.649
#Bubble Chart 
p <- plot_ly(AFdf_pivot2, x = ~avg_prob, y = ~avg_ROA,
             textposition = "auto",
             type = 'scatter', 
             mode = 'markers', 
             size = ~avg_cpc, 
             color = ~`Publisher Name`, 
             colors = 'Paired',
             marker = list(opacity = 0.8, sizemode = 'diameter')) %>%
  layout(title = 'Publisher Strategy',
         xaxis = list(title = "Probability of Booking", showgrid = TRUE),
         yaxis = list(title = "Average ROA", showgrid = TRUE),
         showlegend = TRUE)

p